%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the data set. The rest of the script expects a 'class' column in it.
protein = pd.read_csv("nuclear.csv")

# Per-class mean and standard deviation.
# Aggregate only the numeric columns: pandas >= 2.0 no longer silently drops
# non-numeric columns in groupby reductions and raises TypeError instead.
# NOTE(review): assumes the file also holds text columns (e.g. an ID and the
# label columns used as plot hues later on) -- confirm against the CSV.
numeric_cols = list(
    protein.select_dtypes(include='number').columns.drop('class', errors='ignore')
)
print('Mean(cm): \n', protein.groupby(['class'])[numeric_cols].mean())
print('-'*80,'\n')
print('Stdev(cm): \n', protein.groupby(['class'])[numeric_cols].std())
print('-'*80)
# Split the frame into positional column groups.
# NOTE(review): the first range (1, 18) overlaps the second (11, 21); every
# other range is contiguous. Kept as-is because later code relies on the
# columns in proteinData[0] -- confirm whether the overlap is intended.
column_ranges = [
    (1, 18), (11, 21), (21, 31), (31, 41),
    (41, 51), (51, 61), (61, 71), (71, 78),
]
proteinData = [protein.iloc[:, start:stop] for start, stop in column_ranges]

# Columns 78..81 are used as grouping / hue labels further down.
proteinClass = protein.iloc[:, 78:82]
# Pick the attributes to analyse and put the label columns next to them.
sub_protein = pd.concat([proteinData[0], proteinClass], axis=1)
sub_protein.describe(percentiles=[])

# Names of the columns that contain at least one missing value.
has_missing = sub_protein.isna().any()
na_cols = has_missing[has_missing].index.tolist()
print(na_cols)

## Option 1: drop every row that has a missing value.
proteinClean = sub_protein.dropna()
## Option 2: fill each missing value with the mean of its own 'class' group.
# Work on a copy so sub_protein keeps its NaNs for the comparison below.
fill_protein = sub_protein.copy()
for n in na_cols:
    # transform('mean') returns a Series aligned to the original row index,
    # so it can be handed straight to fillna. The previous
    # groupby(...).apply(lambda ...) form returns a group-keyed MultiIndex on
    # recent pandas versions and cannot be assigned back to the column.
    group_mean = fill_protein.groupby('class', sort=False)[n].transform('mean')
    fill_protein[n] = fill_protein[n].fillna(group_mean)
# Print the columns that still contain missing values.
## Raw data, before any missing-value handling.
na_cols = [name for name, flag in sub_protein.isna().any().items() if flag]
print(na_cols)

## fill_protein has been imputed with group means, so no missing values are
## expected and an empty list should be printed.
na_cols = [name for name, flag in fill_protein.isna().any().items() if flag]
print(na_cols)

sub_protein.describe(percentiles=[], include='all')

# Labels of the first five columns.
print(fill_protein.columns[0:5].tolist())
# Line plot of the first five columns against the row index.
fill_protein.iloc[:,0:5].plot()

# Box plot of DYRK1A_N per class.
fig, ax = plt.subplots(figsize=(8, 6))
fill_protein.boxplot(column=['DYRK1A_N'], by='class', ax=ax)
# Clear the automatic "Boxplot grouped by ..." super-title. This has to run
# AFTER boxplot(), which sets that title itself; clearing it beforehand has
# no effect.
fig.suptitle('')

# Histograms of the first four columns, then DYRK1A_N split by genotype.
fill_protein.iloc[:,0:4].hist(figsize=(8,6))
fill_protein.groupby('Genotype').hist(column=['DYRK1A_N'])

# Pairwise scatter matrix of all numeric columns, coloured by class.
sns.pairplot(fill_protein, hue='class')

# Overlaid histograms of the columns.
fill_protein.plot(kind='hist')
# NR1_N vs pNR1_N, with the label columns attached so they can be used as hue.
Relation = pd.concat([fill_protein[['NR1_N', 'pNR1_N']], proteinClass], axis=1)

# One pair plot per label column; the last two use explicit palettes.
for pairplot_kwargs in (
    {'hue': 'class'},
    {'hue': 'Genotype'},
    {'hue': 'Treatment', 'palette': "husl"},
    {'hue': 'Behavior', 'palette': "Set2"},
):
    sns.pairplot(Relation, **pairplot_kwargs)

# DYRK1A_N vs ITSN1_N, coloured by class.
Relation2 = pd.concat([fill_protein[['DYRK1A_N', 'ITSN1_N']], proteinClass], axis=1)
sns.pairplot(Relation2, hue='class')
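# Note: the human eye cannot judge the size ratio of two-dimensional shapes
# very precisely. Judging relative size (which one is larger) may be fine,
# but judging absolute size is difficult.
# Searching for "pie chart" also turns up plenty of debate on this point.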
fill_protein.describe()

# Quartiles and inter-quartile range of every numeric column.
# numeric_only=True is passed explicitly: on pandas >= 2.0 the default is
# False and quantile() fails on the non-numeric label columns.
Q1 = fill_protein.quantile(q=0.25, numeric_only=True)
Q3 = fill_protein.quantile(q=0.75, numeric_only=True)
IQR = Q3 - Q1

# One row per statistic, one column per attribute.
row_name = ['Q3', 'Q1', 'IQR']
IQR_DF = pd.DataFrame([Q3, Q1, IQR], index=row_name)
IQR_DF
# First five rows of the first five columns, ordered by NR1_N and then by
# (DYRK1A_N, ITSN1_N), both ascending.
leading_cols = fill_protein.iloc[:, 0:5]
leading_cols.sort_values(by=['NR1_N']).head()
leading_cols.sort_values(by=['DYRK1A_N', 'ITSN1_N']).head()
# Pairwise correlation of the first ten columns.
pcorr = fill_protein.iloc[:, 0:10].corr()
pcorr

# Colour-graded table with two decimals. Styler.set_precision() was removed
# in pandas 2.0; an explicit format string works on old and new versions.
pcorr.style.background_gradient().format("{:.2f}")

# Heat map of the same matrix.
f, ax = plt.subplots(figsize=(10, 8))
# The mask is all False, i.e. no cell is hidden. dtype=bool replaces np.bool,
# which was removed in NumPy 1.24.
sns.heatmap(pcorr, mask=np.zeros_like(pcorr, dtype=bool), cmap="Blues",
            square=True, ax=ax)
# Box plot of DYRK1A_N per class.
fig, ax = plt.subplots(figsize=(8, 6))
fill_protein.boxplot(column=['DYRK1A_N'], by='class', ax=ax)
# Clear the automatic "Boxplot grouped by ..." super-title. This must happen
# after boxplot(), which sets that title itself.
fig.suptitle('')

# Box plot of DYRK1A_N per genotype.
fig, ax = plt.subplots(figsize=(8, 6))
fill_protein.boxplot(column=['DYRK1A_N'], by='Genotype', ax=ax)
fig.suptitle('')
sns.set(style="white", palette="muted", color_codes=True)

# 2x2 grid: one distribution plot per attribute, each in its own colour.
# axes.flat walks the grid row by row, i.e. [0, 0], [0, 1], [1, 0], [1, 1].
# NOTE(review): seaborn has deprecated distplot (histplot/displot are the
# suggested replacements); kept here so the output stays unchanged.
f, axes = plt.subplots(2, 2, figsize=(9,9))
panels = zip(
    ('DYRK1A_N', 'ITSN1_N', 'BDNF_N', 'NR1_N'),
    ("b", "r", "g", "m"),
    axes.flat,
)
for column_name, colour, panel_ax in panels:
    sns.distplot(fill_protein[column_name], color=colour, ax=panel_ax)

plt.show()